Identification of butyrate-producing bacteria

We start with the microSubt phyloseq object created in the Analysis-OleateMicrobiota file. We create a csv file containing the major butyrate-producing microbiota reported in literature called butyrate.csv. This file was created from the silva v138.1 taxonomy file. Then, we filter the microSubt with genera from butyrate.csv.

# load libraries
library("phyloseq")
library("ggplot2")
library("plyr")
library("vegan")
library("grid")
library("directlabels")
library("knitr")
library("clustsig")
library("ellipse")
library("ape")
library("DESeq2")
library("microbial")
library("VennDiagram")
library("microbiome")
library("microbiomeutilities")
## Warning: replacing previous import 'ggplot2::alpha' by 'microbiome::alpha' when
## loading 'microbiomeutilities'
library("viridis")
library("RColorBrewer")
library("ggpubr")
library("patchwork")

Make a phyloseq object from mothur files

## import data 
sharedFile = read.table("oleate.opti_mcc.shared")
sharedFile = t(sharedFile) ## transform data
rownames(sharedFile) = sharedFile[,1]  ## define rowname
colnames(sharedFile) = sharedFile[2,]  ## define column names
sharedFile <-as.matrix(sharedFile)
sharedFile = sharedFile[,2:57]       ## only include columns that reflects OTU numbers (remember order [row,colum]) 2 -> 57 for me
## as I had 57 samples
sharedFile = sharedFile[4:819,]     ## and which rows you want. I have 819 OTUs so I changed this number to 819
# original => sharedFile = sharedFile[4:819,]
class(sharedFile) <- "numeric"
head(sharedFile)  ## look at head first 7 lines to see it's made correclty
##        ce4406 ce4407 ce4408 ce4410 ce4411 ce4412 ce4413 ce4414 ce4415 ce4416
## Otu001    648    490    686    411    726    838    761    175    144   1520
## Otu002      1    143   1693   1037    794    548    356    418    149   1075
## Otu003    867    743   1670    733   1171    559    953    165    202    564
## Otu004    310      8   1379    808    903    419   1487    953     46    192
## Otu005    483    704   1357   2703    825    735    381    416    188   2496
## Otu006      0      0      0      1    170      2      4     49     40    365
##        ce4417 ce4418 ce4419 ce4420 ce4421 ce4422 ce4423 ce4424 co4406 co4407
## Otu001   1118   1091   1691    703    936   1011     54   1452   1170    257
## Otu002   2412   6002   1290    586   1216   1117   2346     48      5    843
## Otu003   1201   1204    871    980    890    922   1522    388    406   1539
## Otu004     29    291   1469     21      6      4      3     20   1066     14
## Otu005   2134    969   1805   1737   1385   1602   1220    391    292    188
## Otu006    123   1317     97    245    237    272    146   2061      3      2
##        co4408 co4410 co4411 co4412 co4413 co4414 co4415 co4416 co4417 co4418
## Otu001   1823    856    397    705    356    392     45    201   4331    356
## Otu002    767    177    220    404     24    442    261   1187   1901   1269
## Otu003   1116    517    909    996    394    499    490    498    664   1311
## Otu004   1089    564    518    468    660   1151     27    133    481     13
## Otu005    264    400    795    778    310    875    168     31    804    483
## Otu006      0      0     59      1      1    110     19    195   1149     66
##        co4419 co4420 co4421 co4422 co4423 co4424 co4428 f4406 f4407 f4408 f4410
## Otu001    182    188   1372    944   1621    233   1166  7021  9127  2210  3181
## Otu002   2380   1814    815    390    778   2231     10    31  1790    14   173
## Otu003    567    499    527    791   1318   1249    225  1108  1447   679   726
## Otu004     31    484     35      0     18      4      6  1221  2604   386  2143
## Otu005    281    212    501    519    584    237    148  1300   380  1279   628
## Otu006    240     39    280    237    667    376   1572     3     0     0     1
##        f4411 f4412 f4413 f4414 f4415 f4416 f4417 f4418 f4419 f4420 f4421 f4422
## Otu001  6323  8943  2011  4584  4530  7401  3309  6001  7938  4862  1619  1334
## Otu002  1367    30    99  2063  1193   898   472   265  3751  3667  1244  1022
## Otu003  1242  1133  1532  1163   687   902   536   443  1878   554  1781  1668
## Otu004  2441   716  1082  4784  4027  3709  3814  2177  1569  2593    80    21
## Otu005  1154  1847  1024   816   591  1263  1511   521  1505  1013   272   558
## Otu006   786     1     0  1648   862  1234  1175   931  2214  1074  2252  1979
##        f4423 f4424 f4428
## Otu001  3749 11842   791
## Otu002  1421  1194   996
## Otu003   610  2459   971
## Otu004   147    23     5
## Otu005  1367    64   324
## Otu006   798   506  1765
## Import subsampled otu matrix (26880 seqs)
sharedsubFile = read.table('oleate.opti_mcc.0.03.subsample.shared')
sharedsubFile = t(sharedsubFile)
rownames(sharedsubFile) = sharedsubFile[,1]
colnames(sharedsubFile) = sharedsubFile[2,]
sharedsubFile = sharedsubFile[,2:57]
sharedsubFile = sharedsubFile[4:494,]
class(sharedsubFile) <- "numeric"
head(sharedsubFile)
##        ce4406 ce4407 ce4408 ce4410 ce4411 ce4412 ce4413 ce4414 ce4415 ce4416
## Otu001    335    271    174     72    204    358    233    131    144    312
## Otu002      1     68    361    199    235    236    103    311    149    237
## Otu003    468    380    379    139    359    254    289    125    202    108
## Otu004    145      2    312    128    261    186    453    680     46     37
## Otu005    245    348    299    492    254    319    124    308    188    509
## Otu006      0      0      0      0     38      1      1     31     40     84
##        ce4417 ce4418 ce4419 ce4420 ce4421 ce4422 ce4423 ce4424 co4406 co4407
## Otu001    187    161    311    109    212    197     10    137    456    145
## Otu002    414    825    219     86    255    194    590      3      2    528
## Otu003    213    166    156    157    197    176    380     30    160    916
## Otu004      6     38    284      3      0      1      0      1    407      9
## Otu005    412    134    311    262    300    295    299     34    128    109
## Otu006     20    173     20     46     47     44     30    184      2      2
##        co4408 co4410 co4411 co4412 co4413 co4414 co4415 co4416 co4417 co4418
## Otu001    473    406    172    289    160    168     41    127    611    138
## Otu002    224     82    101    156     11    181    240    770    281    465
## Otu003    304    272    445    419    180    214    459    308     85    543
## Otu004    317    258    253    167    321    469     27     86     69      7
## Otu005     83    202    400    294    147    373    155     19    105    172
## Otu006      0      0     24      0      0     44     18    121    169     18
##        co4419 co4420 co4421 co4422 co4423 co4424 co4428 f4406 f4407 f4408 f4410
## Otu001     84     80    358    335    340     72    237   933  1161   548   615
## Otu002    873    792    208    118    166    744      1     4   230     2    29
## Otu003    212    202    130    290    274    413     48   141   199   168   115
## Otu004     18    223      9      0      4      1      2   151   360   105   410
## Otu005    111     73    132    184    122     82     24   154    52   311   113
## Otu006     95     14     68     96    126    127    293     0     0     0     0
##        f4411 f4412 f4413 f4414 f4415 f4416 f4417 f4418 f4419 f4420 f4421 f4422
## Otu001   929  1220   419   585   727   779   584  1196   788   533   250   228
## Otu002   196     5    22   259   167   105    74    49   390   416   224   144
## Otu003   171   167   296   144   124    95    99    87   195    75   271   261
## Otu004   384    99   204   639   655   415   673   421   145   281    15     5
## Otu005   179   283   209    99   101   125   256    93   145   131    40    87
## Otu006   106     0     0   177   135   128   212   197   229   116   332   338
##        f4423 f4424 f4428
## Otu001   401  1571   135
## Otu002   158   159   161
## Otu003    65   348   150
## Otu004    12     3     0
## Otu005   137    11    59
## Otu006    95    66   275
dim(sharedsubFile)
## [1] 491  56
sharedsubFile <-as.matrix(sharedsubFile)
## Import taxonomy file 
## Copy and paste into notepad and save. Then carry on: 
taxFile = read.table('oleate.taxonomy', header=T, sep='\t')
rownames(taxFile) = taxFile[,1]
taxFile = taxFile[,3:8]
taxFile = as.matrix(taxFile)
head(taxFile)
##        Kingdom    Phylum              Class              Order               
## Otu001 "Bacteria" "Firmicutes"        "Bacilli"          "Lactobacillales"   
## Otu002 "Bacteria" "Verrucomicrobiota" "Verrucomicrobiae" "Verrucomicrobiales"
## Otu003 "Bacteria" "Bacteroidota"      "Bacteroidia"      "Bacteroidales"     
## Otu004 "Bacteria" "Firmicutes"        "Bacilli"          "Lactobacillales"   
## Otu005 "Bacteria" "Firmicutes"        "Clostridia"       "Lachnospirales"    
## Otu006 "Bacteria" "Firmicutes"        "Bacilli"          "Erysipelotrichales"
##        Family                Genus            
## Otu001 "Streptococcaceae"    "Lactococcus"    
## Otu002 "Akkermansiaceae"     "Akkermansia"    
## Otu003 "Bacteroidaceae"      "Bacteroides"    
## Otu004 "Lactobacillaceae"    "Lactobacillus"  
## Otu005 "Lachnospiraceae"     "Lachnospiraceae"
## Otu006 "Erysipelotrichaceae" "Dubosiella"
## import metadata file
metaFile = read.table('oleate.metadata', header=T, sep='\t')
rownames(metaFile) = metaFile[,1]
metaFile = metaFile[,1:4]
head(metaFile)
##         group  site mouse condition
## ce4406 ce4406 cecum  4406    10%Fat
## ce4407 ce4407 cecum  4407    10%Fat
## ce4408 ce4408 cecum  4408    10%Fat
## ce4410 ce4410 cecum  4410    10%Fat
## ce4411 ce4411 cecum  4411    10%Fat
## ce4412 ce4412 cecum  4412    10%Fat
## Create phyloseq object
OTU = otu_table(sharedFile, taxa_are_rows = TRUE)
OTUsub = otu_table(sharedsubFile, taxa_are_rows = TRUE)
TAX = tax_table(taxFile)
META = sample_data(metaFile)
physeq = phyloseq(OTU, TAX, META)
physeqSub = phyloseq(OTUsub, TAX, META)
physeqSub
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 491 taxa and 56 samples ]
## sample_data() Sample Data:       [ 56 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 491 taxa by 6 taxonomic ranks ]
## Get rid of any OTUs not present in any samples and get relative abundance
microSub <- prune_taxa(taxa_sums(physeqSub) > 0, physeqSub)
microSubRel = transform_sample_counts(microSub, function(x) x / sum(x) )
microSubRelFilt = filter_taxa(microSubRel, function(x) mean(x) > 1e-5, TRUE)
# for subsampled shared file
# create a tree and a new phyloseq object
random_tree = rtree(ntaxa(microSub), rooted=TRUE, tip.label=taxa_names(microSub))
plot(random_tree)

microSubt = merge_phyloseq(microSub, random_tree)

Select Butyrate-forming bacteria from the microSubt phyloseq object

but <- read.csv("butyrate.csv", header = TRUE, sep = ",")
head(but$Genus)
## [1] "Anaerostipes" "Anaerostipes" "Anaerostipes" "Anaerostipes" "Anaerostipes"
## [6] "Anaerostipes"
microBut <- subset_taxa(microSubt, Genus %in% but$Genus)
microBut
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 92 taxa and 56 samples ]
## sample_data() Sample Data:       [ 56 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 92 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 92 tips and 91 internal nodes ]

Plotting core graphs

# Plotting rarefaction
rarecurve(t(otu_table(microBut)), step=50, cex=0.5)

# Plotting taxonomy
plot_bar(microBut, fill="Phylum") + facet_wrap(~condition, scales = "free_x", nrow = 1)+theme(text = element_text(size=16))+geom_bar(stat="identity")+
  scale_fill_brewer(type = "seq", palette = "Spectral")+
  theme(
    legend.background = element_rect(
      fill = "lemonchiffon", 
      colour = "grey50", 
      size = 1
    )
  )

plot_bar(microBut, fill="Phylum") + facet_wrap(~site, scales = "free_x", nrow = 1)+theme(text = element_text(size=16))+geom_bar(stat="identity")+
  scale_fill_brewer(type = "seq", palette = "Spectral")+
  theme(
    legend.background = element_rect(
      fill = "lemonchiffon", 
      colour = "grey50", 
      size = 1
    )
  ) 

plot_bar(microBut, fill="Phylum")+facet_grid(condition~site, scales = "free")+theme(text = element_text(size=16))+geom_bar(stat="identity")+
  scale_fill_brewer(type = "seq", palette = "Spectral")+
  theme(
    legend.background = element_rect(
      fill = "lemonchiffon", 
      colour = "grey50", 
      size = 1
    )
  )

# Plotting PCoA
my.PCoA2 <- ordinate(microBut, "PCoA", "bray")
plot_ordination(microBut, my.PCoA2, type = "group", color = "condition", shape = "site")+theme(text = element_text(size=20))+ geom_point(size=5) + scale_color_manual(values=c("brown3", "steelblue","grey50"))+ 
  geom_text(label= "group", size=2)
## Warning in plot_ordination(microBut, my.PCoA2, type = "group", color = "condition", : type argument not supported. `type` set to 'samples'.
## See `plot_ordination('list')`

plot_ordination(microBut, my.PCoA2, type = "group", color = "condition")+ facet_wrap(~site, scales = "free_x", nrow = 1) + geom_point(size=5)+theme(text = element_text(size=20)) + scale_color_manual(values=c("brown3", "steelblue","grey50"))
## Warning in plot_ordination(microBut, my.PCoA2, type = "group", color = "condition"): type argument not supported. `type` set to 'samples'.
## See `plot_ordination('list')`

plot_ordination(microBut, my.PCoA2, type = "group", color = "condition", shape = "site")+theme(text = element_text(size=20))+ geom_point(size=5)+ 
  geom_line() + scale_color_manual(values=c("brown3", "steelblue","grey50"))
## Warning in plot_ordination(microBut, my.PCoA2, type = "group", color = "condition", : type argument not supported. `type` set to 'samples'.
## See `plot_ordination('list')`

plot_ordination(microBut, my.PCoA2, type = "group", color = "condition")+theme(text = element_text(size=20))+ geom_point(size=5)+ stat_ellipse(level=0.9)  + scale_color_manual(values=c("brown3", "steelblue","grey50"))
## Warning in plot_ordination(microBut, my.PCoA2, type = "group", color = "condition"): type argument not supported. `type` set to 'samples'.
## See `plot_ordination('list')`

# PCoA plot using the unweighted UniFrac as distance
wunifrac_dist = phyloseq::distance(microBut, method="unifrac", weighted=F)
ordination = ordinate(microBut, method="PCoA", distance=wunifrac_dist)
plot_ordination(microBut, ordination, color="condition") + theme(aspect.ratio=1)+
  ggtitle("PCoA: unweigthed Unifrac")+geom_point(size=5)  + scale_color_manual(values=c("brown3", "steelblue","grey50"))

#Plot PCoA using the weighted UniFrac as distance
wunifrac_dist = phyloseq::distance(microBut, method="unifrac", weighted=T)
ordUF = ordinate(microBut, method="PCoA", distance=wunifrac_dist)
plot_ordination(microBut, ordUF, color = "condition") + 
  ggtitle("PCoA: weigthed Unifrac")+geom_point(size=5) + scale_color_manual(values=c("brown3", "steelblue","grey50"))

# Normalization

phy <- normalize(microBut, method = "relative")
## Normalization using relative method
# Plot taxonomy by group
plotbar(phy,level="Phylum", group="condition") + theme(axis.text.x = element_text(size = 16))

plotbar(phy,level="Phylum", group="site") + theme(axis.text.x = element_text(size = 16))

# Plot alpha diversity
plotalpha(microBut, group = "condition", color = c("brown3","grey50", "steelblue"))

plotalpha(microBut, group = "site")
## No significant difference between any of the groups

# Test for significance between groups
beta_condition <-betatest(phy,group="condition")
## Do PERMANOVA for: condition
beta_site <-betatest(phy,group="site")
## Do PERMANOVA for: site
# Biomarkers discovery and plotting
bio <- biomarker(microBut,group="condition")
## Normalization using relative method
plotmarker(bio,level="Genus")

# LefSe testing and plotting
lda3 <- ldamarker(microBut,group="condition")
## Normalization using relative method
lda3
## # A tibble: 42 x 14
## # Groups:   rank, tax [42]
##    rank   tax      statistic p.value parameter method    p.adj `10%Fat` `60%Fat`
##    <chr>  <chr>        <dbl>   <dbl>     <int> <chr>     <dbl>    <dbl>    <dbl>
##  1 Class  Bacteri…      6.25 4.39e-2         2 Kruska… 6.92e-2    2912.  112564.
##  2 Class  Bacteri…      6.72 3.47e-2         2 Kruska… 6.92e-2  457315.  117648.
##  3 Class  Bacteri…      3.09 2.13e-1         2 Kruska… 2.50e-1      NA       NA 
##  4 Family Bacteri…      6.25 4.39e-2         2 Kruska… 6.92e-2    2912.  112564.
##  5 Family Bacteri…      7.14 2.82e-2         2 Kruska… 6.42e-2  418131.   79221.
##  6 Family Bacteri…      8.65 1.32e-2         2 Kruska… 3.87e-2       0     5937.
##  7 Family Bacteri…      2.79 2.47e-1         2 Kruska… 2.60e-1      NA       NA 
##  8 Family Bacteri…      3.87 1.44e-1         2 Kruska… 1.97e-1      NA       NA 
##  9 Family Bacteri…     35.6  1.88e-8         2 Kruska… 2.57e-7   14337. 2215068.
## 10 Family Bacteri…     29.9  3.24e-7         2 Kruska… 2.21e-6 3906047. 2229724.
## # … with 32 more rows, and 5 more variables: Oleate <dbl>, max <dbl>,
## #   min <dbl>, LDAscore <dbl>, direction <chr>
write.csv(lda3, "lda3.csv", row.names = FALSE)
lda4 <- read.csv(file = 'lda4.csv')
plotLDA(lda4,group=c("10%Fat","60%Fat"),lda=5,pvalue=0.05)+theme(axis.text.y = element_text(size = 16))+theme(text = element_text(size=20))

plotLDA(lda4,group=c("10%Fat","Oleate"),lda=5,pvalue=0.05)+theme(axis.text.y = element_text(size = 16))+theme(text = element_text(size=20))

plotLDA(lda4,group=c("Oleate","60%Fat"),lda=5,pvalue=0.05) +theme(axis.text.y = element_text(size = 16))+theme(text = element_text(size=20))

## Modified difftest function from microbial package to test for significance

# create difftest2 function
difftest2<-function(physeq,group,pvalue=0.05,padj=NULL,log2FC=0,gm_mean=TRUE,fitType="local",quiet=FALSE){
  if(!taxa_are_rows(physeq)){
    physeq<-t(physeq)
  }
  otu <- as(otu_table(physeq),"matrix")
  tax <- as.data.frame(as.matrix(tax_table(physeq)))
  colData<-as(sample_data(physeq),"data.frame")
  colData$condition<-colData[,group]
  contrasts<-levels(factor(unique(colData$condition)))[1:2]
  if(isTRUE(gm_mean)){
    countData<-round(otu, digits = 0)
  }else{
    countData<-round(otu, digits = 0)+1
  }
  dds <- DESeqDataSetFromMatrix(countData, colData, as.formula(~condition))
  if(isTRUE(gm_mean)){
    geoMeans = apply(counts(dds), 1, gm_mean)
    dds = estimateSizeFactors(dds, geoMeans = geoMeans)
  }
  dds <- DESeq(dds, fitType=fitType)
  res <- results(dds,contrast=c("condition",contrasts),cooksCutoff = FALSE)
  res_tax = cbind(as.data.frame(res), as.matrix(countData[rownames(res), ]))
  if(!is.null(padj)){
    pval<-padj
    sig <- rownames(subset(res,padj<pval&abs(log2FoldChange)>log2FC))
  }else{
    pval<-pvalue
    sig <- rownames(subset(res,pvalue<pval&abs(log2FoldChange)>log2FC))
  }
  res_tax$Significant<- "No"
  res_tax$Significant <- ifelse(rownames(res_tax) %in% sig, "Yes", "No")
  res_tax <- cbind(res_tax, tax[rownames(res),])
  return(as.data.frame(res_tax))
}
# Test for significant bacteria using microbial package
mic_res <- difftest2(microBut,group="condition", gm_mean=FALSE)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 2 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
plotdiff(mic_res,level="Genus",padj=0.001,log2FC = 3,fontsize.y = 14)

## Test for significant OTUs using DESeq2

sample_data(microBut)$condition <- as.factor(sample_data(microBut)$condition)
ds = phyloseq_to_deseq2(microBut, ~ condition)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
ds = DESeq(ds)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 2 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
alpha = 0.01
# compare Oleate and 60%Fat
res = results(ds, contrast=c("condition", "Oleate", "60%Fat"), alpha=alpha)
res = res[order(res$padj, na.last=NA), ]
res_sig = res[(res$padj < alpha), ]
res_sig
## log2 fold change (MLE): condition Oleate vs 60%Fat 
## Wald test p-value: condition Oleate vs 60%Fat 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## Otu028  31.63472        5.66889  0.751470   7.54374 4.56671e-14 4.20138e-12
## Otu031  27.91385        3.69359  0.552382   6.68665 2.28335e-11 1.05034e-09
## Otu075   5.48373        4.18206  0.753635   5.54919 2.87003e-08 8.80143e-07
## Otu052  11.59919        2.40842  0.493055   4.88470 1.03587e-06 2.38251e-05
## Otu029  27.86040       -4.29768  0.908277  -4.73168 2.22666e-06 4.09706e-05
## Otu097   1.10108       -4.44731  1.032066  -4.30913 1.63897e-05 2.51309e-04
res_sig = cbind(as(res_sig, "data.frame"), as(tax_table(microBut)[rownames(res_sig), ], "matrix"))
ggplot(res_sig, aes(x=Genus, y=log2FoldChange, color=Phylum)) +
  geom_jitter(size=5, width = 0.2) + theme(text = element_text(size=16)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

# compare 10%Fat and 60%Fat
res2 = results(ds, contrast=c("condition", "10%Fat", "60%Fat"), alpha=alpha)
res2 = res2[order(res2$padj, na.last=NA), ]
res_sig2 = res2[(res2$padj < alpha), ]
res_sig2
## log2 fold change (MLE): condition 10%Fat vs 60%Fat 
## Wald test p-value: condition 10%Fat vs 60%Fat 
## DataFrame with 7 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## Otu029  27.86040       -7.83160  0.985919  -7.94345 1.96632e-15 1.80901e-13
## Otu028  31.63472        5.48619  0.751341   7.30186 2.83816e-13 1.30555e-11
## Otu075   5.48373        4.19824  0.753127   5.57441 2.48367e-08 7.61660e-07
## Otu031  27.91385        3.03387  0.553472   5.48153 4.21672e-08 9.69845e-07
## Otu097   1.10108       -4.22887  1.042150  -4.05783 4.95301e-05 9.11354e-04
## Otu052  11.59919        1.97014  0.495261   3.97798 6.95032e-05 1.06572e-03
## Otu077   4.86304        2.49440  0.724857   3.44124 5.79065e-04 7.61057e-03
res_sig2 = cbind(as(res_sig2, "data.frame"), as(tax_table(microBut)[rownames(res_sig2), ], "matrix"))
ggplot(res_sig2, aes(x=Genus, y=log2FoldChange, color=Phylum)) +
  geom_jitter(size=5, width = 0.2) + theme(text = element_text(size=16)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

# compare 10%Fat and Oleate
res3 = results(ds, contrast=c("condition", "10%Fat", "Oleate"), alpha=alpha)
res3 = res3[order(res3$padj, na.last=NA), ]
res_sig3 = res3[(res3$padj < alpha), ]
res_sig3
## log2 fold change (MLE): condition 10%Fat vs Oleate 
## Wald test p-value: condition 10%Fat vs Oleate 
## DataFrame with 4 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## Otu055   6.82993       -2.78274  0.568278  -4.89679 9.74142e-07 8.96211e-05
## Otu041  13.67379       -2.08072  0.509738  -4.08193 4.46630e-05 2.05450e-03
## Otu029  27.86040       -3.53391  0.915537  -3.85994 1.13417e-04 3.47811e-03
## Otu060   8.21605        2.33845  0.634518   3.68540 2.28344e-04 5.25190e-03
res_sig3 = cbind(as(res_sig3, "data.frame"), as(tax_table(microBut)[rownames(res_sig3), ], "matrix"))
ggplot(res_sig3, aes(x=Genus, y=log2FoldChange, color=Phylum)) +
  geom_jitter(size=5, width = 0.2) + theme(text = element_text(size=16)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

## Plot heatmap

plot_taxa_heatmap(microBut,
                  subset.top = 20,
                  VariableA = "condition",
                  heatcolors = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
                  transformation = "log10"
)
## Top 20 OTUs selected
## log10, if zeros in data then log10(1+x) will be used
## First top taxa were selected and 
## then abundances tranformed to log10(1+X)
## Warning in transform(phyobj1, "log10"): OTU table contains zeroes. Using log10(1
## + x) transform.

## Venn Diagram

pseq<-microBut
table(meta(pseq)$condition, useNA = "always")
## 
## 10%Fat 60%Fat Oleate   <NA> 
##     21     14     21      0
# convert to relative abundances
pseq.rel <- microbiome::transform(pseq, "compositional")
disease_states <- unique(as.character(meta(pseq.rel)$condition))
print(disease_states)
## [1] "10%Fat" "Oleate" "60%Fat"
#Write a for loop to go through each of the disease_states one by one and combine identified core taxa into a list.
list_core <- c() # an empty object to store information

for (n in disease_states){ # for each variable n in DiseaseState
  #print(paste0("Identifying Core Taxa for ", n))
  
  ps.sub <- subset_samples(pseq.rel, condition == n) # Choose sample from DiseaseState by n
  
  core_m <- core_members(ps.sub, # ps.sub is phyloseq selected with only samples from g 
                         detection = 0.001, # 0.001 in atleast 90% samples 
                         prevalence = 0.75)
  print(paste0("No. of core taxa in ", n, " : ", length(core_m))) # print core taxa identified in each DiseaseState.
  list_core[[n]] <- core_m # add to a list core taxa for each group.
  #print(list_core)
}
## [1] "No. of core taxa in 10%Fat : 19"
## [1] "No. of core taxa in Oleate : 16"
## [1] "No. of core taxa in 60%Fat : 22"
print(list_core)
## $`10%Fat`
##  [1] "Otu061" "Otu077" "Otu080" "Otu052" "Otu012" "Otu041" "Otu063" "Otu075"
##  [9] "Otu026" "Otu050" "Otu031" "Otu070" "Otu044" "Otu028" "Otu018" "Otu048"
## [17] "Otu037" "Otu084" "Otu060"
## 
## $Oleate
##  [1] "Otu061" "Otu077" "Otu052" "Otu012" "Otu041" "Otu063" "Otu075" "Otu026"
##  [9] "Otu055" "Otu031" "Otu070" "Otu044" "Otu028" "Otu048" "Otu037" "Otu060"
## 
## $`60%Fat`
##  [1] "Otu078" "Otu079" "Otu029" "Otu052" "Otu012" "Otu083" "Otu041" "Otu063"
##  [9] "Otu026" "Otu050" "Otu055" "Otu031" "Otu070" "Otu044" "Otu018" "Otu048"
## [17] "Otu097" "Otu037" "Otu084" "Otu076" "Otu060" "Otu089"
# Specify colors and plot venn
# supplying colors in the order they appear in list_core
mycols <- c("brown3", "grey50", "steelblue")
venn.diagram(list_core,fill = mycols, filename = 'But_venn_diagramm.png',
             output=TRUE)
## [1] 1

Plotting top 4 genus

pa <- aggregate_taxa(microBut, "Genus")
top_four <- top_taxa(pa, 4)
top_four
## [1] "Bacteria_Firmicutes_Clostridia_Oscillospirales_Oscillospiraceae_uncultured"
## [2] "Bacteria_Firmicutes_Clostridia_Lachnospirales_Lachnospiraceae_uncultured"  
## [3] "Lachnoclostridium"                                                         
## [4] "Oscillibacter"
mycols <- c("brown3", "steelblue", "grey50")
p <- plot_listed_taxa(pa, top_four, 
                      group= "condition",
                      group.order = c("10%Fat","Oleate","60%Fat"),
                      group.colors = mycols,
                      add.violin = TRUE,
                      violin.opacity = 0.3,
                      dot.opacity = 0.25,
                      box.opacity = 0.25,
                      panel.arrange= "wrap")
## An additonal column Sam_rep with sample names is created for reference purpose
comps <- make_pairs(sample_data(pa)$condition)
p <- p + stat_compare_means(
  comparisons = comps,
  label = "p.format",
  tip.length = 0.05,
  method = "wilcox.test")
print(p + ylab("Relative abundance") + scale_y_continuous(labels = scales::percent))

## Get taxa summary by group(s)

p0 <- microSubt
p0.gen <- aggregate_taxa(microBut,"Genus")
x.d <- dominant_taxa(p0,level = "Genus", group="condition")
head(x.d$dominant_overview)
## # A tibble: 6 x 5
## # Groups:   condition [3]
##   condition dominant_taxa       n rel.freq rel.freq.pct
##   <chr>     <chr>           <int>    <dbl> <chr>       
## 1 10%Fat    Lachnospiraceae    14     66.7 67%         
## 2 60%Fat    Lachnospiraceae     9     64.3 64%         
## 3 Oleate    Lachnospiraceae     9     42.9 43%         
## 4 10%Fat    Lactococcus         6     28.6 29%         
## 5 Oleate    Lactococcus         6     28.6 29%         
## 6 Oleate    Akkermansia         4     19   19%
tx.sum1 <- taxa_summary(p0, "Phylum")
## Data provided is not compositional 
##  will first transform
p0.rel <- microbiome::transform(p0, "compositional")

grp_abund <- get_group_abundances(p0.rel, 
                                  level = "Phylum", 
                                  group="condition",
                                  transform = "compositional")
## An additonal column Sam_rep with sample names is created for reference purpose
mycols <- c("brown3", "steelblue","grey50")
# clean names 
grp_abund$OTUID <- gsub("p__", "",grp_abund$OTUID)
grp_abund$OTUID <- ifelse(grp_abund$OTUID == "", 
                          "Unclassified", grp_abund$OTUID)
mean.plot <- grp_abund %>% # input data
  ggplot(aes(x= reorder(OTUID, mean_abundance), # reroder based on mean abundance
             y= mean_abundance,
             fill=condition)) + # x and y axis 
  geom_bar(     stat = "identity", 
                position=position_dodge()) + 
  scale_fill_manual("condition", values=mycols) + # manually specify colors
  theme_bw() + # add a widely used ggplot2 theme
  ylab("Mean Relative Abundance") + # label y axis
  xlab("Phylum") + # label x axis
  coord_flip() # rotate plot 
mean.plot

## Plotting density of reads per sample

p0 <- microBut
print_ps(p0)
## 01] ntaxa = 92
## 02] nsamples = 56
## 03] nsamplesvariables = 4
## 04] nranks = 6
## 05] Min. number of reads = 93
## 06] Max. number of reads = 553
## 07] Total number of reads = 16479
## 08] Average number of reads = 294.27
## 09] Median number of reads = 296
## 10] Sparsity = 0.699340062111801
## 11] Number of singletons = 29
## 12] % of taxa that are singletons 
##   (i.e. exactly one read detected across all samples) = 31.5217391304348
kable(head(tax_table(p0)))
Kingdom Phylum Class Order Family Genus
Otu120 Bacteria Firmicutes Bacilli Lactobacillales Enterococcaceae Enterococcus
Otu159 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Lachnoclostridium
Otu437 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured
Otu061 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured
Otu360 Bacteria Firmicutes Clostridia Oscillospirales Oscillospiraceae uncultured
Otu078 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured
# reduce size for example
ps0 <- core(p0, detection = 10, prevalence = 20 / 100)
# Add a prefix to taxa labels
ps0.f2 <- format_to_besthit(ps0, prefix = "MyBug-")
kable(head(tax_table(ps0.f2))[3:6])
Domain Phylum Class Order Family Genus best_hit
MyBug-Otu012:uncultured Bacteria Firmicutes Clostridia Oscillospirales Oscillospiraceae uncultured MyBug-Otu012:uncultured
MyBug-Otu041:Oscillibacter Bacteria Firmicutes Clostridia Oscillospirales Oscillospiraceae Oscillibacter MyBug-Otu041:Oscillibacter
MyBug-Otu026:Lachnoclostridium Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Lachnoclostridium MyBug-Otu026:Lachnoclostridium
MyBug-Otu050:uncultured Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured MyBug-Otu050:uncultured
p <- plot_read_distribution(ps0.f2, groups = "condition", 
                            plot.type = "density")
print(p + theme_biome_utils())

ps0 <- core(p0, detection = 10, prevalence = 20 / 100)
pseq_df <- phy_to_ldf(ps0, transform.counts = NULL)
## An additonal column Sam_rep with sample names is created for reference purpose
kable(head(pseq_df))
OTUID Kingdom Phylum Class Order Family Genus Sam_rep Abundance group site mouse condition
Otu029 Bacteria Firmicutes Clostridia Clostridiales Clostridiaceae Clostridium ce4406 0 ce4406 cecum 4406 10%Fat
Otu052 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured ce4406 13 ce4406 cecum 4406 10%Fat
Otu012 Bacteria Firmicutes Clostridia Oscillospirales Oscillospiraceae uncultured ce4406 36 ce4406 cecum 4406 10%Fat
Otu041 Bacteria Firmicutes Clostridia Oscillospirales Oscillospiraceae Oscillibacter ce4406 1 ce4406 cecum 4406 10%Fat
Otu026 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae Lachnoclostridium ce4406 6 ce4406 cecum 4406 10%Fat
Otu050 Bacteria Firmicutes Clostridia Lachnospirales Lachnospiraceae uncultured ce4406 0 ce4406 cecum 4406 10%Fat
# Plot Density of phyla
pseq <- microBut
# check 10%Fat
control_ps <- subset_samples(pseq, condition=="10%Fat")
p_hc <- taxa_distribution(control_ps) + 
  theme_biome_utils() + 
  labs(title = "10%Fat")

# check Oleate
oleate_ps <- subset_samples(pseq, condition=="Oleate")
p_oleate <- taxa_distribution(oleate_ps) + 
  theme_biome_utils() + 
  labs(title = "Oleate")

# check 60%Fat
HFD_ps <- subset_samples(pseq, condition=="60%Fat")
p_hfd <- taxa_distribution(HFD_ps) + 
  theme_biome_utils() + 
  labs(title = "60%Fat")

# harnessing the power of patchwork
p_hc / p_oleate / p_hfd + plot_layout(guides = "collect") + 
  plot_annotation(tag_levels = "A")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 31 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Removed 1 row(s) containing missing values (geom_path).

# Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.1.1             ggpubr_0.4.0               
##  [3] RColorBrewer_1.1-2          viridis_0.5.1              
##  [5] viridisLite_0.3.0           microbiomeutilities_1.00.15
##  [7] dplyr_1.0.4                 microbiome_1.12.0          
##  [9] VennDiagram_1.6.20          futile.logger_1.4.3        
## [11] microbial_0.0.17            DESeq2_1.30.1              
## [13] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [15] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [17] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [19] IRanges_2.24.1              S4Vectors_0.28.1           
## [21] BiocGenerics_0.36.0         ape_5.4-1                  
## [23] ellipse_0.4.2               clustsig_1.1               
## [25] knitr_1.31                  directlabels_2021.1.13     
## [27] vegan_2.5-7                 lattice_0.20-41            
## [29] permute_0.9-5               plyr_1.8.6                 
## [31] ggplot2_3.3.3               phyloseq_1.34.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1             backports_1.2.1          fastmatch_1.1-0         
##   [4] igraph_1.2.6             splines_4.0.3            BiocParallel_1.24.1     
##   [7] digest_0.6.27            foreach_1.5.1            htmltools_0.5.1.1       
##  [10] fansi_0.4.2              magrittr_2.0.1           memoise_2.0.0           
##  [13] cluster_2.1.1            DECIPHER_2.18.1          openxlsx_4.2.3          
##  [16] limma_3.46.0             Biostrings_2.58.0        annotate_1.68.0         
##  [19] RcppParallel_5.0.3       prettyunits_1.1.1        jpeg_0.1-8.1            
##  [22] colorspace_2.0-0         ggrepel_0.9.1            blob_1.2.1              
##  [25] haven_2.3.1              xfun_0.21                crayon_1.4.1            
##  [28] RCurl_1.98-1.2           jsonlite_1.7.2           genefilter_1.72.1       
##  [31] dada2_1.18.0             phangorn_2.5.5           survival_3.2-7          
##  [34] iterators_1.0.13         glue_1.4.2               gtable_0.3.0            
##  [37] zlibbioc_1.36.0          XVector_0.30.0           DelayedArray_0.16.1     
##  [40] car_3.0-10               Rhdf5lib_1.12.1          abind_1.4-5             
##  [43] scales_1.1.1             pheatmap_1.0.12          futile.options_1.0.1    
##  [46] DBI_1.1.1                edgeR_3.32.1             rstatix_0.7.0           
##  [49] Rcpp_1.0.6               xtable_1.8-4             progress_1.2.2          
##  [52] foreign_0.8-81           bit_4.0.4                httr_1.4.2              
##  [55] ellipsis_0.3.1           farver_2.0.3             pkgconfig_2.0.3         
##  [58] XML_3.99-0.5             sass_0.3.1               locfit_1.5-9.4          
##  [61] utf8_1.1.4               labeling_0.4.2           tidyselect_1.1.0        
##  [64] rlang_0.4.10             reshape2_1.4.4           AnnotationDbi_1.52.0    
##  [67] munsell_0.5.0            cellranger_1.1.0         tools_4.0.3             
##  [70] cachem_1.0.4             cli_2.3.1                generics_0.1.0          
##  [73] RSQLite_2.2.3            ade4_1.7-16              broom_0.7.5             
##  [76] evaluate_0.14            biomformat_1.18.0        stringr_1.4.0           
##  [79] fastmap_1.1.0            yaml_2.2.1               bit64_4.0.5             
##  [82] zip_2.1.1                randomForest_4.6-14      purrr_0.3.4             
##  [85] nlme_3.1-152             formatR_1.7              rstudioapi_0.13         
##  [88] compiler_4.0.3           curl_4.3                 png_0.1-7               
##  [91] ggsignif_0.6.1           tibble_3.0.6             geneplotter_1.68.0      
##  [94] bslib_0.2.4              stringi_1.5.3            highr_0.8               
##  [97] forcats_0.5.1            Matrix_1.3-2             multtest_2.46.0         
## [100] vctrs_0.3.6              pillar_1.5.0             lifecycle_1.0.0         
## [103] rhdf5filters_1.2.0       jquerylib_0.1.3          data.table_1.14.0       
## [106] bitops_1.0-6             R6_2.5.0                 latticeExtra_0.6-29     
## [109] hwriter_1.3.2            ShortRead_1.48.0         gridExtra_2.3           
## [112] rio_0.5.16               codetools_0.2-18         lambda.r_1.2.4          
## [115] MASS_7.3-53.1            assertthat_0.2.1         rhdf5_2.34.0            
## [118] withr_2.4.1              GenomicAlignments_1.26.0 Rsamtools_2.6.0         
## [121] GenomeInfoDbData_1.2.4   mgcv_1.8-34              hms_1.0.0               
## [124] gghalves_0.1.1           quadprog_1.5-8           tidyr_1.1.2             
## [127] rmarkdown_2.7            carData_3.0-4            Rtsne_0.15